package com.lwl.office.webeditor;

import cn.hutool.core.io.resource.ResourceUtil;
import com.lwl.office.webeditor.parser.ParserSelector;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;

/**
 * Converts HTML markup into a Word document.
 */
public class Html2WordConvert {
    /**
     * Converts an HTML string into a Word document.
     *
     * <p>The markup is pre-processed before parsing: {@code &nbsp;} entities are replaced
     * with plain spaces and every ASCII control character (line breaks and tabs included)
     * is removed. The result is parsed with Jsoup, and each direct child node of
     * {@code <body>} is passed to the parser chosen by {@link ParserSelector}, all sharing
     * one {@link ParserContext}.
     *
     * @param html the HTML markup to convert; must not be {@code null}
     * @return the document obtained from the {@link ParserContext} after all nodes were parsed
     * @throws NullPointerException if {@code html} is {@code null}
     */
    public static XWPFDocument convert(String html){
        if (html == null) {
            throw new NullPointerException("html must not be null");
        }
        ParserContext context = new ParserContext();
        // The entity ends with a semicolon; the previous literal "&nbsp:" never matched.
        String cleaned = html.replace("&nbsp;", " ");
        // \p{Cntrl} is [\x00-\x1F\x7F], so it already covers '\n'; no separate pass is needed.
        cleaned = cleaned.replaceAll("\\p{Cntrl}", "");
        Element body = Jsoup.parse(cleaned).body();
        // Work on copies so the parsers are free to modify the nodes they receive.
        List<Node> nodes = body.childNodesCopy();
        for (Node node : nodes) {
            ParserSelector.select(node).parser(context, null);
        }
        return context.getWord();
    }

    /**
     * Demo entry point: reads the resource {@code word.html} through Hutool's
     * {@code ResourceUtil}, converts it, and writes the result to {@code /file/out.docx}.
     *
     * @param args not used
     * @throws IOException if the output directory cannot be created or the document
     *                     cannot be written
     */
    public static void main(String[] args) throws IOException {
        String html = ResourceUtil.readUtf8Str("word.html");
        XWPFDocument document = convert(html);
        File file = new File("/file/out.docx");
        File parent = file.getParentFile();
        // mkdirs() returns false both on failure and when the directory already exists,
        // so only treat it as an error if the directory is still missing afterwards.
        if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Could not create output directory: " + parent);
        }
        // FileOutputStream creates the file itself; try-with-resources guarantees the
        // stream is closed even when writing the document fails.
        try (OutputStream out = new FileOutputStream(file)) {
            document.write(out);
        }
    }
}
